********************************************************************************
* clean_cap_gain.do
* Purpose: Build owner-level capital gains datasets for matched and eligible firms.
*
* Uses T2 Schedule 50 (T2s50) shareholder records to identify the owners of
* each firm, then merges in realized capital gains from T1 individual tax returns.
* Collapses to the firm x year level with total capital gains.
*
* Two samples are constructed:
*   1. Matched sample  -> firm_owner.dta       (owners of matched treated/control firms)
*   2. Eligible sample -> eligible_firm_owner.dta (owners of all eligible firms)
*
* Inputs:
*   $data1/T2s50_YYYY.dta        - Shareholder records by firm x year
*   $data/firm_matched_list.dta  - List of matched firm IDs (from matching_firm.do)
*   $data/firm_eligible_list.dta - List of eligible firm IDs (from matching_firm_eligible.do)
*   $data1/T1_YYYY.dta           - Individual tax records (capital gains)
*
* Outputs:
*   $data/firm_owner.dta          - Matched-sample owner panel (firm x year_prior x year)
*   $data/eligible_firm_owner.dta - Eligible-sample owner panel (firm x year)
********************************************************************************

//------------------------------------------------------------------------------
// SECTION 1: OWNER PANEL FOR MATCHED SAMPLE (firm_owner.dta)
//
// For each year, loads T2s50 shareholder records and keeps only shareholders
// of matched treated/control firms. Merges T1 records to get capital
// gains, then collapses to the firm level.
//   cap_tot_net_calc - Total realized capital gains for all owners of the firm
//   treated/pairid   - From firm_matched_list (identifies treatment group and pair)
//------------------------------------------------------------------------------
forvalues y = 2001/2017 {

	use $data1/T2s50_`y', replace

	* Drop records with missing firm or person identifier
	drop if mi(casenum2019) | mi(entid_syn)
	gen year = `y'

	* Keep only shareholders of matched firms (both treated and control)
	joinby entid_syn using $data/firm_matched_list, unmatched(none)

	* Merge in owner's capital gains from T1 individual tax return.
	* update: fills in missing values in master from the using dataset.
	merge m:1 casenum2019 using $data1/T1_`y', keepusing(cap_tot_net_calc) keep(1 3) nogen update

	* Collapse to firm x year_prior x year: sum capital gains
	gcollapse (sum) cap_tot_net_calc (firstnm) treated pairid, by(entid_syn year_prior year)

	compress
	save $data/firm_owner_`y', replace
}

clear
forvalues y = 2001/2017 {
	append using $data/firm_owner_`y', force
	erase $data/firm_owner_`y'.dta
}

* Create a unique firm x year_prior ID for panel structure
gegen firm_id = group(entid_syn year_prior)
gsort firm_id year

compress
save $data/firm_owner, replace


//------------------------------------------------------------------------------
// SECTION 2: ELIGIBLE FIRM LIST PREPARATION
//
// firm_eligible_list.dta has one row per firm x year; drop duplicates to get
// a unique list of eligible firm IDs for the shareholder merge.
// NOTE: The variable is named entid (not entid_syn) in this file.
//------------------------------------------------------------------------------
use $data/firm_eligible_list, clear
duplicates drop entid, force
save $data/firm_eligible_list_new.dta, replace


//------------------------------------------------------------------------------
// SECTION 3: OWNER PANEL FOR ELIGIBLE SAMPLE (eligible_firm_owner.dta)
//
// Same logic as Section 1, but for the broader eligible sample (all firms that
// satisfied selection criteria, whether matched or not).
// Used in Table B4 to compare owner characteristics across groups.
//
// NOTE: An error message may appear ("variable entid_syn does not uniquely
// identify observations in the using data") because firm_eligible_list_new
// may have duplicate entid_syn values. This is expected behavior.
//------------------------------------------------------------------------------
forvalues y = 2001/2017 {

	use $data1/T2s50_`y', replace

	* Drop records with missing identifiers
	drop if mi(casenum2019) | mi(entid_syn)
	gen year = `y'

	* Keep only shareholders of eligible firms
	merge m:1 entid_syn using $data/firm_eligible_list_new, keep(3) nogen
	// NOTE: "variable entid_syn does not uniquely identify observations in the using data"
	// is expected here because firm_eligible_list_new may contain duplicate firm IDs.

	* Merge in capital gains from T1
	merge m:1 casenum2019 using $data1/T1_`y', keepusing(cap_tot_net_calc) keep(1 3) nogen update

	* Collapse to firm x year level
	gcollapse (sum) cap_tot_net_calc, by(entid_syn year)

	compress
	save $data/eligible_firm_owner_`y', replace
}

clear
forvalues y = 2001/2017 {
	append using $data/eligible_firm_owner_`y', force
	erase $data/eligible_firm_owner_`y'.dta
}

gsort entid_syn year

compress
save $data/eligible_firm_owner, replace
